This is just a test of an initial idea.

Setup

Imagine a known binary tree that describes the relationship between a set of species. We know the relationship but we do not know the individual strength of the effects. We would like to be able to utilize the information on the relationship between the species for later classification and/or prediction. In the figure below the boxes represent obsered abundances while the circles are latent (unobserved) variables. We assume that we have a total of \(N\) simultaneous measurements of each of the observed variables.

Estimating the relationship between taxa

library(mvtnorm)
N <- 100
df <- data.frame(H=rnorm(N),
                 I=rnorm(N),
                 J=rnorm(N),
                 K=rnorm(N),
                 L=rnorm(N),
                 M=rnorm(N),
                 N=rnorm(N),
                 O=rnorm(N)
                 )

Sigma <- diag(8)
Sigma[1,2] <- Sigma[2,1] <- .5
Sigma[3,4] <- Sigma[4,3] <- .5
Sigma[5,6] <- Sigma[6,5] <- .5
Sigma[7,8] <- Sigma[8,7] <- .5

mydat <- data.frame(rmvnorm(3, mean=rep(0, 8), Sigma))


Sigma <- diag(4)
Sigma[1,2] <- Sigma[2,1] <- .5
Sigma[3,4] <- Sigma[4,3] <- .5

mydat <- data.frame(rmvnorm(N, mean=rep(0, 4), Sigma))
names(mydat) <- c("D", "E", "F", "G")
head(mydat)
##             D          E          F          G
## 1  0.85333598  0.8104226 -0.4877673  0.7180980
## 2  0.60244507  0.7445297 -0.1078784 -0.4226488
## 3 -1.12122919 -1.0012946 -0.2616403  0.1201814
## 4  1.54148198  1.2870652  1.0655272  0.9354132
## 5  0.06851309 -1.4266270 -1.4125591 -0.1234586
## 6  0.67985843  1.7121311  1.5092601  0.6533469
#names(mydat) <- names(df)
head(mydat)
##             D          E          F          G
## 1  0.85333598  0.8104226 -0.4877673  0.7180980
## 2  0.60244507  0.7445297 -0.1078784 -0.4226488
## 3 -1.12122919 -1.0012946 -0.2616403  0.1201814
## 4  1.54148198  1.2870652  1.0655272  0.9354132
## 5  0.06851309 -1.4266270 -1.4125591 -0.1234586
## 6  0.67985843  1.7121311  1.5092601  0.6533469
library(sem)
qqqqqmodel.taxa <- specifyModel(text="
A<->A, theta1, NA
A->B, gam1, NA
A->C, gam1, NA
B->D, gam2, NA
B->E, gam2, NA
C->F, gam3, NA
C->G, gam3, NA
D->H, gam4, NA
D->I, gam4, NA
E->J, gam5, NA
E->K, gam5, NA
F->L, gam6, NA
F->M, gam6, NA
G->N, gam7, NA
G->O, gam7, NA
")
## NOTE: it is generally simpler to use specifyEquations() or cfa()
##       see ?specifyEquations
## NOTE: adding 14 variances to the model
model.taxa2 <- specifyModel(text="
A<->A, theta1, NA
A->B, gam1, NA
A->C, NA, 1
B->D, NA,   1
B->E, gam1, NA
C->F, NA,   1
C->G, gam1, NA
")
## NOTE: it is generally simpler to use specifyEquations() or cfa()
##       see ?specifyEquations
## NOTE: adding 6 variances to the model
EmpVar <- var(mydat)
EmpVar
##             D           E          F           G
## D  0.95884763  0.46356679 -0.2114149 -0.09470069
## E  0.46356679  0.89860182 -0.1156579 -0.08704957
## F -0.21141490 -0.11565794  1.0164024  0.61803530
## G -0.09470069 -0.08704957  0.6180353  0.96543561
res <- sem(model.taxa2, S=EmpVar, N=N, maxiter=50, quiet=FALSE)


#EmpVar <- var(df)
#EmpVar
#head(df)
#res <- sem(model.taxa, data=mydat, maxiter=50)

#res <- sem(model.taxa, EmpVar, N=N, maxiter=50)
# str(res)
res
## 
##  Model Chisquare =  0.3080346   Df =  2 
## 
##     theta1       gam1       V[B]       V[C]       V[D]       V[E] 
## -0.4464748  0.4763963  1.0711234  1.7473364 -0.0109707  0.6770224 
##       V[F]       V[G] 
## -0.2838383  0.6716702 
## 
##  Iterations =  46
res$C
##            D           E          F           G
## D  0.9588238  0.46200645 -0.2126989 -0.10132896
## E  0.4620064  0.89712058 -0.1013290 -0.04827274
## F -0.2126989 -0.10132896  1.0170234  0.61972564
## G -0.1013290 -0.04827274  0.6197256  0.96690523
res$A
##   D E F G         A         B         C
## D 0 0 0 0 0.0000000 1.0000000 0.0000000
## E 0 0 0 0 0.0000000 0.4763963 0.0000000
## F 0 0 0 0 0.0000000 0.0000000 1.0000000
## G 0 0 0 0 0.0000000 0.0000000 0.4763963
## A 0 0 0 0 0.0000000 0.0000000 0.0000000
## B 0 0 0 0 0.4763963 0.0000000 0.0000000
## C 0 0 0 0 1.0000000 0.0000000 0.0000000
#sem(model.taxa, data=df)

Extending the model to classification/prediction